Testing and plotting captured data for further classification

# Delete all variables
# NOTE(review): rm(list = ls()) is discouraged in scripts — it clears only the
# global environment (not loaded packages or options); prefer restarting R.
rm( list = ls() )

Used libraries:

# Import libraries
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(dplyr)
## 
## Attache Paket: 'dplyr'
## Die folgenden Objekte sind maskiert von 'package:stats':
## 
##     filter, lag
## Die folgenden Objekte sind maskiert von 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plotly)
## 
## Attache Paket: 'plotly'
## Das folgende Objekt ist maskiert 'package:ggplot2':
## 
##     last_plot
## Das folgende Objekt ist maskiert 'package:stats':
## 
##     filter
## Das folgende Objekt ist maskiert 'package:graphics':
## 
##     layout
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attache Paket: 'mosaic'
## Das folgende Objekt ist maskiert 'package:Matrix':
## 
##     mean
## Das folgende Objekt ist maskiert 'package:plotly':
## 
##     do
## Die folgenden Objekte sind maskiert von 'package:dplyr':
## 
##     count, do, tally
## Das folgende Objekt ist maskiert 'package:ggplot2':
## 
##     stat
## Die folgenden Objekte sind maskiert von 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## Die folgenden Objekte sind maskiert von 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
library(isotree)
library(party)
## Lade nötiges Paket: grid
## Lade nötiges Paket: mvtnorm
## Lade nötiges Paket: modeltools
## Lade nötiges Paket: stats4
## Lade nötiges Paket: strucchange
## Lade nötiges Paket: zoo
## 
## Attache Paket: 'zoo'
## Die folgenden Objekte sind maskiert von 'package:base':
## 
##     as.Date, as.Date.numeric
## Lade nötiges Paket: sandwich
library(rpart)
library(caret)
## 
## Attache Paket: 'caret'
## Das folgende Objekt ist maskiert 'package:mosaic':
## 
##     dotPlot
# Used to scale time
library(lubridate)
## Lade nötiges Paket: timechange
## 
## Attache Paket: 'lubridate'
## Die folgenden Objekte sind maskiert von 'package:base':
## 
##     date, intersect, setdiff, union
library(scales)
## 
## Attache Paket: 'scales'
## Das folgende Objekt ist maskiert 'package:mosaic':
## 
##     rescale
library(hms)
## 
## Attache Paket: 'hms'
## Das folgende Objekt ist maskiert 'package:lubridate':
## 
##     hms

Read in data and inspect

# Read the captured sensor data (idiom fix: `<-` for assignment, not `=`).
read_data <- read.csv("Idle_Run_Tobias_Egger.csv")
motion_data <- data.frame(read_data)
  • There are 2893 observations

  • This value is available under the column n

inspect(motion_data)
## 
## categorical variables:  
##       name     class levels    n missing
## 1  Creator character      1 2893       0
## 2 Activity character      2 2893       0
##                                    distribution
## 1 Tobias Egger (100%)                          
## 2 Run (50.1%), Idle (49.9%)                    
## 
## quantitative variables:  
##        name   class           min            Q1        median           Q3
## 1        ID integer  1.000000e+00  7.240000e+02  1.447000e+03 2.170000e+03
## 2    Sample integer  1.000000e+00  1.300000e+01  2.500000e+01 3.800000e+01
## 3 timestamp numeric  1.672264e+12  1.672265e+12  1.672265e+12 1.672332e+12
## 4         X numeric -2.091795e+01 -1.339564e+00 -2.709400e-02 1.158740e+00
## 5         Y numeric -1.174263e+01  1.241818e+00  3.385210e+00 5.582491e+00
## 6         Z numeric -1.014845e+01  6.115233e+00  8.706345e+00 1.025009e+01
##            max         mean           sd    n missing
## 1 2.893000e+03 1.447000e+03 8.352815e+02 2893       0
## 2 5.000000e+01 2.555617e+01 1.438819e+01 2893       0
## 3 1.672332e+12 1.672298e+12 3.357830e+07 2893       0
## 4 1.741553e+01 3.877723e-03 3.360706e+00 2893       0
## 5 1.808569e+01 3.400054e+00 3.745857e+00 2893       0
## 6 4.263883e+01 8.313343e+00 5.274758e+00 2893       0

Activity data distribution

For Tobias Egger, activity idle, there are 1445 observations

For Tobias Egger, activity run, there are 1448 observations

# Bar chart of observations per activity, panelled by creator.
# Fix: the former group_by() before ggplot() has no effect on the plot and
# was removed. NOTE: the two colors passed to geom_bar(color = ...) are
# recycled over the bars and only line up because there are exactly two bars.
ggplot(motion_data, aes(x = Activity, fill = Activity)) +
  geom_bar(color = c('light blue', 'dark orange')) +
  facet_grid(. ~ Creator) +
  scale_fill_manual(values = c('light blue', 'dark orange')) +
  scale_colour_manual(values = c('light blue', 'dark orange'))

# Number of observations per activity.
activity_count <- motion_data %>%
  count(Activity, name = "count")

activity_count

Use activities from Tobias Egger

# Keep only rows recorded by Tobias Egger (here this is every row).
motion_data_te <- subset(motion_data, Creator == "Tobias Egger")

Activity-wise distribution

Extract hours from timestamp

Link: https://www.appsloveworld.com/r/100/298/density-plot-based-on-time-of-the-day

Link: https://stackoverflow.com/questions/13456241/convert-unix-epoch-to-date-object

density_data <- data.frame(motion_data_te)

# convert character to POSIXct (timestamp is in milliseconds since the epoch,
# hence the division by 1000)
density_data$timestamp <- as.POSIXct(density_data$timestamp/1000, origin="1970-01-01")
# extract hour and minute:
density_data$time <- hms::hms(second(density_data$timestamp), minute(density_data$timestamp), hour(density_data$timestamp))  
# convert to POSIXct again since ggplot does not work with class hms.
density_data$time <- as.POSIXct(density_data$time)
# calendar date of each observation
density_data$date <-as.Date(as.POSIXct(density_data$timestamp, origin="1970-01-01"))
density_data

Plot timestamp density

# Histogram + density of observation times per activity.
# Fixes: after_stat(density) replaces `..density..` (deprecated in ggplot2
# 3.4.0), and "%H:%M" replaces "%H:%m" — %m is the MONTH, %M the minute.
ggplot(density_data, aes(x=time, fill=Activity)) +
  geom_histogram(bins=(sqrt(length(density_data$Activity))),fill="white",color="black",aes(y=after_stat(density))) +
  geom_density(alpha=.3) +
  scale_x_datetime(breaks = date_breaks("1 hours"), labels=date_format("%H:%M"), expand = c(0,0))

Axis activity-wise distribution

Density for X axis

# Density of the X axis per activity over a relative-frequency histogram.
# Fix: after_stat(density) replaces `..density..`, deprecated in ggplot2 3.4.0.
ggplot(motion_data_te, aes(x=X, fill=Activity)) +
  geom_histogram(bins=(sqrt(length(motion_data_te$Activity))),fill="white",color="black",aes(y=after_stat(density))) +
  geom_density(alpha=.3)

Density for Y axis

# Density of the Y axis per activity over a relative-frequency histogram.
# Fix: after_stat(density) replaces `..density..`, deprecated in ggplot2 3.4.0.
ggplot(motion_data_te, aes(x=Y, fill=Activity)) +
  geom_histogram(bins=(sqrt(length(motion_data_te$Activity))),fill="white",color="black",aes(y=after_stat(density))) +
  geom_density(alpha=.3)

Density for Z axis

# Density of the Z axis per activity over a relative-frequency histogram.
# Fix: after_stat(density) replaces `..density..`, deprecated in ggplot2 3.4.0.
ggplot(motion_data_te, aes(x=Z, fill=Activity)) +
  geom_histogram(bins=(sqrt(length(motion_data_te$Activity))),fill="white",color="black",aes(y=after_stat(density))) +
  geom_density(alpha=.3)

Compare axis data points over time

# Raw X acceleration over time
ggplot(motion_data_te, aes(timestamp, X)) +
geom_line()

# Raw Y acceleration over time
ggplot(motion_data_te, aes(timestamp, Y)) +
geom_line()

# Raw Z acceleration over time
ggplot(motion_data_te, aes(timestamp, Z)) +
geom_line()

Split the axis up per activity

# Per-activity traces of all three axes over time (X = dark green,
# Y = light blue, Z = dark orange).
# Fixes: `<-` instead of `=` for assignment; the former group_by() wrapper
# was a no-op because ggplot() ignores dplyr groups.
idle_activity <- subset(motion_data_te, Activity == "Idle")

idle_plot <- ggplot(idle_activity, aes(x = timestamp)) +
  labs(x = "Timestamp", y = "Acceleration") +
  geom_line(aes(y = X), color = "dark green", alpha = 0.8) +
  geom_line(aes(y = Y), color = "light blue", alpha = 0.8) +
  geom_line(aes(y = Z), color = "dark orange", alpha = 0.8)


run_activity <- subset(motion_data_te, Activity == "Run")

run_plot <- ggplot(run_activity, aes(x = timestamp)) +
  labs(x = "Timestamp", y = "Acceleration") +
  geom_line(aes(y = X), color = "dark green", alpha = 0.8) +
  geom_line(aes(y = Y), color = "light blue", alpha = 0.8) +
  geom_line(aes(y = Z), color = "dark orange", alpha = 0.8)

Idle activity

Dark green: X

Light blue: Y

Dark orange: Z

ggplotly(idle_plot)

Run activity

Dark green: X

Light blue: Y

Dark orange: Z

ggplotly(run_plot)

Basic X, Y, Z plot for motion data

In the first three plots we can already distinguish between idle and run.

Third plot:

dark green: X

light blue: Y

dark orange: Z

plot_data <- subset(motion_data_te, select=c(X, Y, Z))

# Index plots of each axis (x axis = observation number)
plot(motion_data_te$X)

plot(motion_data_te$Y)

plot(motion_data_te$Z)

# Scatter-plot matrix of the three axes; the three colors are recycled
# over the points, not mapped per axis.
plot(plot_data, col = c('dark green', 'light blue', 'dark orange'))

Use the pairplot for quantitative variables

Light blue: Activity running

Dark orange: Activity idle

We can also see here that timestamp and Sample correlate best, with a correlation of 0.869.

For the axes, Z and Y correlate best, with a correlation of 0.079.

# Fix: include Activity in the plotted data and map it via `columns = 1:5`;
# referencing motion_data_te$Activity from inside aes() tied the plot to an
# external data frame and only worked while row orders matched.
# Also fixes the "quantiative" typo in the title.
pair_data <- subset(motion_data_te, select=c(Sample, timestamp, X, Y, Z, Activity))

ggpairs(data=pair_data, columns = 1:5, aes(color = Activity), title="Species pair plot with quantitative variables")  +
  scale_fill_manual(values=c('light blue', 'dark orange')) +
  scale_colour_manual(values=c('light blue', 'dark orange'))

To get all the outliers of X, Y, Z for the idle and run activities,

I created a custom method to find the outliers and plotted the individual IDs below

For timestamp, X there are 196 outliers

findoutlier <- function(x) {
  # Flag values outside the Tukey fences: more than 1.5 * IQR below the
  # first quartile or above the third quartile.
  lower_fence <- quantile(x, 0.25) - 1.5 * IQR(x)
  upper_fence <- quantile(x, 0.75) + 1.5 * IQR(x)
  x < lower_fence | x > upper_fence
}

# Flag per-activity outliers on (timestamp, X); `outliers` holds the row ID
# for flagged rows and NA otherwise.
data_out <- 
      group_by(motion_data_te, Activity) %>%
      mutate(outliers = ifelse(findoutlier(timestamp) | findoutlier(X),  ID, NA))

# Fix: use an explicit NA test. The former `filter(outliers != "")` only
# worked because `NA != ""` evaluates to NA, which filter() happens to drop.
indexes <- data_out %>%
  filter(!is.na(outliers))
indexes

For timestamp, Y there are 103 outliers

# Flag per-activity outliers on (timestamp, Y); `outliers` holds the row ID
# for flagged rows and NA otherwise.
motion_data_te <- 
      group_by(motion_data_te, Activity) %>%
      mutate(outliers = ifelse(findoutlier(timestamp) | findoutlier(Y),  ID, NA))

# Fix: explicit NA test instead of the accidental `outliers != ""` idiom.
# (The outlier listing is printed twice in the original document.)
motion_data_te %>%
  filter(!is.na(outliers))
motion_data_te
motion_data_te %>%
  filter(!is.na(outliers))

Boxplot timestamp, y for activity idle and run

# Boxplot of Y over timestamp for each activity, panelled by creator.
# Fixes: the former group_by() had no effect on ggplot(), and the empty
# geom_text(label = "") layer drew nothing; both were removed.
ggplot(motion_data_te, aes(x = timestamp, y = Y, fill = Activity)) +
  geom_boxplot(color = "black") +
  facet_grid(Creator ~ Activity) +
  scale_fill_manual(values = c('light blue', 'dark orange')) +
  theme(axis.text.x = element_text(angle = 90))

For timestamp, Z there are 290 outliers

# Flag per-activity outliers on (timestamp, Z); `outliers` holds the row ID
# for flagged rows and NA otherwise.
motion_data_te <- 
      group_by(motion_data_te, Activity) %>%
      mutate(outliers = ifelse(findoutlier(timestamp) | findoutlier(Z),  ID, NA))

# Fix: explicit NA test instead of the accidental `outliers != ""` idiom.
motion_data_te %>%
  filter(!is.na(outliers))

Boxplot timestamp, z for activity idle and run

# Boxplot of Z over timestamp for each activity, panelled by creator.
# Fixes: the former group_by() had no effect on ggplot(), and the empty
# geom_text(label = "") layer drew nothing; both were removed.
ggplot(motion_data_te, aes(x = timestamp, y = Z, fill = Activity)) +
  geom_boxplot(color = "black") +
  facet_grid(Creator ~ Activity) +
  scale_fill_manual(values = c('light blue', 'dark orange')) +
  theme(axis.text.x = element_text(angle = 90))

In total there are 589 outliers.

Boxplot timestamp, x for activity idle and run

# Boxplot of X over timestamp for each activity, panelled by creator.
# Fix: the former group_by() had no effect on ggplot() and was removed.
ggplot(motion_data_te, aes(x = timestamp, y = X, fill = Activity)) +
  geom_boxplot() +
  facet_grid(Creator ~ Activity) +
  scale_fill_manual(values = c('light blue', 'dark orange')) +
  theme(axis.text.x = element_text(angle = 90))

Without outliers -> Didn’t work:

#data <- subset(motion_data_te, Activity == "Idle")

# get the values of the outliers (TRUE/FALSE spelled out; never use T/F)
outliers_X <- boxplot(motion_data_te$X, plot = FALSE)$out

# Fix: match() returns only the FIRST position of each value, so rows sharing
# a duplicated outlier value were missed; which(%in%) finds every occurrence.
index_out <- which(motion_data_te$X %in% outliers_X)

Gather y axis outliers

# get the values of the outliers
outliers_Y <- boxplot(motion_data_te$Y, plot = FALSE)$out

# find all row numbers of the outliers & add them to the vector "index_out"
# (fix: which(%in%) instead of match(), which only finds first occurrences)
index_out <- c(index_out, which(motion_data_te$Y %in% outliers_Y))

Gather z axis outliers

# get the values of the outliers
outliers_Z <- boxplot(motion_data_te$Z, plot = FALSE)$out

# find all row numbers of the outliers & add them to the vector "index_out"
# (fix: which(%in%) instead of match(), which only finds first occurrences)
index_out <- c(index_out, which(motion_data_te$Z %in% outliers_Z))

Print outliers

# show the positions of the outliers in X, Y, Z
index_out
##   [1]    8    9   11   14   16   18   21   22   23   24   25   26   29   31   32
##  [16]   33   36   44   50   51   57   67   69   70   71   73   75   76   77   81
##  [31]   82   89   94   97   99  100  102  107  114  123  126  127  130  132  134
##  [46]  135  142  149  150  175  179  180  181  183  186  188  189  191  192  193
##  [61]  194  195  196  197  199  200  202  234  237  239  241  242  244  245  247
##  [76]  249  250  252  253  255  256  257  258  282  296  299  301  302  305  308
##  [91]  309  316  317  320  322  341  347  349  350  352  353  354  356  358  359
## [106]  360  362  364  366  368  369  373  374  380  404  405  406  407  409  410
## [121]  411  414  415  416  417  418  419  420  421  425  426  429  432  433  458
## [136]  459  460  469  518  525  533  548  549  550  573  574  580  644  749  756
## [151]  757  759  761  764  765  767  768  776  777  778  779  784  785  788  807
## [166]  811  812  814  815  817  818  819  822  823  824  825  826  827  830  832
## [181]  833  834  836  837  838  839  840  841  842  843  845  846  847  892  894
## [196]  916  919  923  924  925  928  929  930  931  932  935  941  943  945  946
## [211]  947  951  952  953  959  961  962  963  967  974  998  999 1001 1002 1003
## [226] 1005 1007 1008 1009 1012 1014 1015 1016 1019 1022 1024 1026 1029 1055 1057
## [241] 1058 1059 1060 1061 1065 1106 1113 1114 1118 1122 1123 1124 1125 1126 1128
## [256] 1129 1130 1131 1132 1133 1136 1148 1161 1163 1165 1170 1171 1172 1174 1175
## [271] 1177 1178 1179 1182 1184 1185 1186 1188 1190 1193 1194 1197 1224 1232 1236
## [286] 1238 1239 1240 1249 1284 1285 1291 1292 1298 1337 1338 1340 1341 1342 1343
## [301] 1345 1349 1350 1351 1352 1353 1354 1357 1358 1362 1364 1365 1366 1389 1390
## [316] 1392 1393 1396 1398 1399 1400 1403 1413 1415 1420 1422 1423 1428 1429 1762
## [331] 1766 2215 2230 2231 2232 2241 2242 2243 2244 2245 2246 2247 2248 2249 2448
## [346] 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525
## [361] 2526 2527 2807 2808    6    9   16   23   27   30   68   71   79   94  128
## [376]  131  150  183  187  202  258  295  298  321  357  363  430  571  572  629
## [391]  781  793  841  956 1006 1027 1048 1051 1056 1080 1118 1123 1160 1167 1175
## [406] 1183 1228 1338 1349 1423 1426 1430 1638 1639 1640 1641 1642 1643 1644 1645
## [421] 1646 1647 1648 1649 1650 1651 1652 2038 2039 2040 2157 2158 2159 2160 2161
## [436] 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176
## [451] 2177 2178 2179 2180 2181 2182 2183 2184 2619 2620 2621 2622 2623   16   23
## [466]   41   52   70   81   85   94   98  100  102  106  124  125  126  132  133
## [481]  135  139  147  154  162  170  181  183  191  196  198  209  213  238  239
## [496]  242  246  248  250  252  254  256  262  266  268  270  274  276  278  296
## [511]  298  302  306  308  310  313  314  315  325  329  333  334  347  348  181
## [526]  350  354  355  356  357  361  363  365  367  374  378  379  390  394  408
## [541]  410  416  417  419  421  423  425  429  438  456  461  478  481  485  489
## [556]  542  546  558  638  749  752  760  763  766  770  777  809  810  815  816
## [571]  818  820  823  827  831  833  838  840  844  847  895  926  933  937  941
## [586]  944  964  999 1006 1007 1008 1010 1018 1020 1031 1048 1051 1059 1066 1076
## [601] 1106 1111 1118 1119 1127 1130 1132 1134 1150 1171 1175 1179 1183 1184 1187
## [616] 1222 1247 1250 1251 1253 1303 1338 1340 1343 1345 1347 1349 1351 1355 1357
## [631] 1363 1365 1367 1371 1375 1379 1383 1394 1404 1411 1419 1431 1663 1664 1665
## [646] 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1906
## [661] 1907 1908 1909 1910 1911 1912 1952 1953 1954 1955 1956 2056 2057 2058 2059
## [676] 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074
## [691] 2075 2076 2077 2078 2079 2080 2081 2082 2083 2165 2166 2167 2168 2169 2170
## [706] 2171 2172 2173 2174 2175 2176 2177 2178 2502 2503 2504 2505 2506 2507 2508
## [721] 2509 2510 2511 2512 2513 2601 2657 2658 2659 2660 2661 2662 2663 2821 2822
## [736] 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2875 2876 2877 2878
## [751] 2879 2880 2881 2882 2883 2884 2885 2886 2887
# remove outliers; unique() makes explicit that rows flagged on several axes
# are dropped only once (negative indexing tolerates duplicates, but the
# intent is clearer this way)
data_no_out <- motion_data_te[-unique(index_out),]
data_no_out
boxplot(data_no_out$X, plot = TRUE)

boxplot(data_no_out$Y, plot = TRUE)

boxplot(data_no_out$Z, plot = TRUE)

Preprocessing

  • delete column ID, outliers
  • encode Creator and Activity

Delete column ID

data_clf <- data.frame(motion_data_te)

# Drop identifier and time columns; only Activity plus the axes remain.
drop_cols <- c("ID", "outliers", "Creator", "Sample", "timestamp")
data_clf <- data_clf[, setdiff(names(data_clf), drop_cols)]
names(data_clf)
## [1] "Activity" "X"        "Y"        "Z"

Encoding of categorical data -> Creator deleted due to NaN after scaling

#data_clf$Creator <- as.numeric(factor(data_clf$Creator))
##data_clf$Activity <- as.numeric(factor(data_clf$Activity))
head(data_clf)

Scale data -> For classification I got the same result with, without scaling

data_clf_scaled <- data.frame(data_clf)
#data_clf_scaled$Sample <- scale(data_clf_scaled$Sample)
#data_clf_scaled$timestamp <- scale(data_clf_scaled$timestamp)
# Fix: scale() returns a 1-column matrix; as.numeric() flattens it so the
# data frame keeps plain numeric columns instead of embedded matrices.
data_clf_scaled$X <- as.numeric(scale(data_clf_scaled$X))
data_clf_scaled$Y <- as.numeric(scale(data_clf_scaled$Y))
data_clf_scaled$Z <- as.numeric(scale(data_clf_scaled$Z))
head(data_clf_scaled)

Build and evaluate a classifier

Data Partitioning

Train: 80 % Test: 20 %

Used later for classification

set.seed(7)

# Stratified 80/20 split on Activity, used for the classification models.
train_index <- createDataPartition(data_clf$Activity, p = 0.80, list = FALSE)

# 80 % of the rows train and validate the models ...
train_data <- data_clf[train_index, ]

# ... and the remaining 20 % are held out for testing.
test_data <- data_clf[-train_index, ]

Used only for isolation_trees

set.seed(7)

# Identical 80/20 split (same seed), kept as a separate copy for the
# isolation-forest experiments.
train_index_iso <- createDataPartition(data_clf$Activity, p = 0.80, list = FALSE)

# 80 % for fitting ...
train_data_iso <- data_clf[train_index_iso, ]

# ... and 20 % held out.
test_data_iso <- data_clf[-train_index_iso, ]

Used to visualize the ID in plotly

set.seed(7)

# Create a list of 80% of the rows in the original dataset we can use for training
train_index_id<-createDataPartition(motion_data_te$Activity, p =0.80, list = FALSE)

# Select 20% of the data for testing
# NOTE(review): the index is computed on motion_data_te but applied to
# motion_data. Both happen to hold the same 2893 rows here, but if they ever
# diverge these ID/timestamp lookups will silently mismatch — confirm.
test_data_id<-motion_data[-train_index_id, ]

# Use the remaining 80% of data to train and validate the models
train_data_id<-motion_data[train_index_id, ]

Find further outliers with isolation forest model:

ntrees - Number of trees. Defaults to 50.

# Fit an isolation forest with 10 trees (package default is 50); ndim = 1
# means axis-parallel splits (the classic isolation-forest variant).
# NOTE(review): train_data still contains the Activity label as a categorical
# column, so the anomaly model also sees the class label — confirm intended.
isotree <- isolation.forest(train_data, ndim=1, ntrees=10, nthreads=1)
isotree
## Isolation Forest model
## Consisting of 10 trees
## Numeric columns: 3
## Categorical columns: 1
## Size: 494.71 KiB

Predict average train depth of the isolation forest:

# Average isolation depth per training row (shallower = easier to isolate).
avg_depth_train <- predict(isotree, train_data_iso, type="avg_depth")
# Fix: the former `par(train_data_iso)` call passed a data frame as graphics
# parameters, which only produced "ist kein Grafikparameter" warnings.
plot(train_data_iso, avg_depth_train, col="darkred",
     main="Average isolation depth Train Data")

As we can see here, the average depth for the train data is at 18.29

summary(avg_depth_train)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.20   16.85   19.16   18.29   20.30   22.33

Predict average test depth of the isolation forest:

# Average isolation depth per test row.
avg_depth_test <- predict(isotree, test_data_iso, type="avg_depth")
# Fix: the former `par(mar = test_data_iso)` call passed a data frame as a
# graphics parameter, which only produced warnings.
plot(test_data_iso, avg_depth_test, col="darkred",
     main="Average isolation depth Test Data")

As we can see here, the average depth for the test data is at 18.187

summary(avg_depth_test)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   7.617  16.710  19.188  18.187  20.313  22.085

The average anomaly score for the train data is at 0.4247 or 42.47 %

Link: https://www.kaggle.com/code/norealityshows/outlier-detection-with-isolation-forest-in-r

# Per-row anomaly score in [0, 1]; values close to 1 indicate likely anomalies.
anomaly_score_train <- predict(isotree, train_data_iso, type = "score")
summary(anomaly_score_train)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3477  0.3828  0.4040  0.4247  0.4506  0.7819

Based on the anomaly score, we can find the outliers and plot the result:

Here the threshold is set to 0.5

If the value is close to 1 the data point is likely an anomaly

if the value is smaller than 0.5, then the data point is likely to be a regular point

#predict outliers within dataset
train_data_iso$pred <- anomaly_score_train
# Label rows with score >= 0.5 as "outlier", the rest as "normal".
train_data_iso$outlier <- as.factor(ifelse(train_data_iso$pred >=0.5, "outlier", "normal"))

Train data with and without outliers

head(train_data_iso)
tail(train_data_iso)

Ggplotly with train data anomalies:

# Outlier status vector used as the color mapping in the plots below.
Anomaly <- train_data_iso$outlier

# Interactive scatter plots of each axis over time, colored by anomaly status
# (green = normal, red = outlier).
# NOTE(review): aes() pulls timestamp/ID/Creator from the separate
# train_data_id frame while the plotted data is train_data_iso; this relies
# on both partitions sharing the same row order (same seed) — confirm.
train_anomalies_x <- group_by(train_data_iso, Activity) %>%
  ggplot(aes(x = train_data_id$timestamp, y = X, color = Anomaly, text = paste("ID:", train_data_id$ID))) + 
  geom_point(shape = 1, alpha = 0.8) +
  labs( x = "Timestamp", y = "X Axis") +
  labs(alpha = "", colour="Legend") +
  facet_grid(train_data_id$Creator ~ Activity) +
  scale_color_manual(values=c("#42f548", "#f54842")) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))

train_anomalies_y <- group_by(train_data_iso, Activity) %>%
  ggplot(aes(x = train_data_id$timestamp, y = Y, color = Anomaly, text = paste("ID:", train_data_id$ID))) + 
  geom_point(shape = 1, alpha = 0.8) +
  labs( x = "Timestamp", y = "Y Axis") +
  labs(alpha = "", colour="Legend") +
  facet_grid(train_data_id$Creator ~ Activity) +
  scale_color_manual(values=c("#42f548", "#f54842")) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))

train_anomalies_z <- group_by(train_data_iso, Activity) %>%
  ggplot(aes(x = train_data_id$timestamp, y = Z, color = Anomaly, text = paste("ID:", train_data_id$ID))) + 
  geom_point(shape = 1, alpha = 0.8) +
  labs( x = "Timestamp", y = "Z Axis") +
  labs(alpha = "", colour="Legend") +
  facet_grid(train_data_id$Creator ~ Activity) +
  scale_color_manual(values=c("#42f548", "#f54842")) +
  theme_bw()+
  theme(axis.text.x = element_text(angle = 90)) 
ggplotly(train_anomalies_x)
ggplotly(train_anomalies_y)
ggplotly(train_anomalies_z)

As we can see here, there are in total 385 outliers across both activities (threshold = 0.5):

  • Activity idle has 191 outliers and 965 normal data points

  • Activity run has 194 outliers and 965 normal data points

train_data_anomalies <- data.frame(train_data_iso)

# Count normal vs. outlier rows per activity.
group_by(train_data_anomalies, Activity, outlier) %>% summarize(
  count = n()
)
## `summarise()` has grouped output by 'Activity'. You can override using the
## `.groups` argument.

Further we can also print out the outlier data rows:

# Fix: filter on the frame's own `outlier` column instead of indexing the
# train_data_anomalies copy — same values, but self-contained and robust to
# row-order changes.
train_data_outliers <- train_data_iso %>% filter(outlier == "outlier")
train_data_outliers
train_data_normal <- train_data_iso %>% filter(outlier == "normal")
train_data_normal

The average anomaly score for the test data is at 0.4272 or 42.72 %

Link: https://www.kaggle.com/code/norealityshows/outlier-detection-with-isolation-forest-in-r

# Anomaly scores for the held-out test partition.
anomaly_score_test <- predict(isotree, test_data_iso, type = "score")
summary(anomaly_score_test)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3517  0.3825  0.4034  0.4272  0.4535  0.6974

Based on the anomaly, we can find the outliers and plot the result:

#predict outliers within dataset
test_data_iso$pred <- anomaly_score_test
# Same 0.5 threshold as used for the training partition.
test_data_iso$outlier <- as.factor(ifelse(test_data_iso$pred >=0.50, "outlier", "normal"))

Test data with and without outliers

head(test_data_iso)
tail(test_data_iso)

Ggplotly with test data anomalies:

# Outlier status vector used as the color mapping in the plots below.
Anomaly <- test_data_iso$outlier

# Interactive scatter plots of each axis over time, colored by anomaly status
# (green = normal, red = outlier).
# NOTE(review): aes() pulls timestamp/ID/Creator from the separate
# test_data_id frame while the plotted data is test_data_iso; this relies on
# both partitions sharing the same row order (same seed) — confirm.
test_anomalies_x <- group_by(test_data_iso, Activity) %>%
  ggplot(aes(x = test_data_id$timestamp, y = X, color = Anomaly, text = paste("ID:", test_data_id$ID))) + 
  geom_point(shape = 1, alpha = 0.8) +
  labs( x = "Timestamp", y = "X Axis") +
  labs(alpha = "", colour="Legend") +
  facet_grid(test_data_id$Creator ~ Activity) +
  scale_color_manual(values=c("#42f548", "#f54842")) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))

test_anomalies_y <- group_by(test_data_iso, Activity) %>%
  ggplot(aes(x = test_data_id$timestamp, y = Y, color = Anomaly, text = paste("ID:", test_data_id$ID))) + 
  geom_point(shape = 1, alpha = 0.8) +
  labs( x = "Timestamp", y = "Y Axis") +
  labs(alpha = "", colour="Legend") +
  facet_grid(test_data_id$Creator ~ Activity) +
  scale_color_manual(values=c("#42f548", "#f54842")) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))

test_anomalies_z <- group_by(test_data_iso, Activity) %>%
  ggplot(aes(x = test_data_id$timestamp, y = Z, color = Anomaly, text = paste("ID:", test_data_id$ID))) + 
  geom_point(shape = 1, alpha = 0.8) +
  labs( x = "Timestamp", y = "Z Axis") +
  labs(alpha = "", colour="Legend") +
  facet_grid(test_data_id$Creator ~ Activity) +
  scale_color_manual(values=c("#42f548", "#f54842")) +
  theme_bw()+
  theme(axis.text.x = element_text(angle = 90)) 
ggplotly(test_anomalies_x)
ggplotly(test_anomalies_y)
ggplotly(test_anomalies_z)

As we can see here, there are in total 107 outliers across both activities (threshold = 0.5):

  • Activity idle has 63 outliers and 226 normal data points

  • Activity run has 44 outliers and 245 normal data points

test_data_anomalies <- data.frame(test_data_iso)

# Count normal vs. outlier rows per activity.
group_by(test_data_anomalies, Activity, outlier) %>% summarize(
  count = n()
)
## `summarise()` has grouped output by 'Activity'. You can override using the
## `.groups` argument.

Further we can also print out the outlier data rows:

# Fix: filter on the frame's own `outlier` column instead of indexing the
# test_data_anomalies copy.
test_data_outliers <- test_data_iso %>% filter(outlier == "outlier")
test_data_outliers
test_data_normal <- test_data_iso %>% filter(outlier == "normal")
test_data_normal

So in total the isolation tree detected 385 outliers for the train data and 107 outliers for the test data (492 overall).

Compared to that, the boxplot analysis found 589 outliers in total.

We can also save the outliers from the sensor data for further tests.

# Score the full cleaned data set and export the non-outlier rows for reuse.
anomalies <- data.frame(data_clf)

anomaly_score <- predict(isotree, newdata = anomalies, type = "score")
summary(anomaly_score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3477  0.3827  0.4039  0.4252  0.4516  0.7819
#predict outliers within dataset (same 0.5 threshold as before)
anomalies$pred <- anomaly_score
anomalies$outlier <- as.factor(ifelse(anomalies$pred >=0.50, "outlier", "normal"))

# data_clf and anomalies share row order, so the logical index lines up.
outliers <- data_clf %>%filter(anomalies$outlier == "outlier")
outliers
no_outliers <- data_clf %>%filter(anomalies$outlier == "normal")
no_outliers
write.csv(no_outliers, "Idle_Run_Tobias_Egger_no_outliers.csv", row.names = FALSE)

Train with cross validation:

Accuracy was at first at 100 % because sample and timestamp was not removed

Now cross validation achieves 72.17 % for the train data
set.seed(10)

# 10-fold cross-validation for a CART (rpart) model on Activity ~ X + Y + Z.
control_par <- trainControl(method = "cv", number=10)

model_cv <- train(Activity~.,
                      data=train_data, 
                      method="rpart", 
                      trControl = control_par,
                      metric="Accuracy"
                      )

model_cv
## CART 
## 
## 2315 samples
##    3 predictor
##    2 classes: 'Idle', 'Run' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 2083, 2083, 2083, 2083, 2084, 2085, ... 
## Resampling results across tuning parameters:
## 
##   cp         Accuracy   Kappa     
##   0.0449827  0.7217987  0.44367984
##   0.1730104  0.6613192  0.32285551
##   0.2621107  0.5365730  0.07240014
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.0449827.
model_cv$finalModel
## n= 2315 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 2315 1156 Run (0.4993521 0.5006479)  
##   2) X>=-2.041005 1887  792 Idle (0.5802862 0.4197138)  
##     4) X< 1.579366 1399  448 Idle (0.6797713 0.3202287) *
##     5) X>=1.579366 488  144 Run (0.2950820 0.7049180) *
##   3) X< -2.041005 428   61 Run (0.1425234 0.8574766) *
Train with bootstrap achieves 76.04 % for the train data
# train a Classification and Regression Trees (CART) model, resampled with a
# single bootstrap repetition
set.seed(34437) #34056

control_par <- trainControl(method = "boot", number=1)

model_boot <- train(Activity~., 
                  data=train_data, 
                  method="rpart",
                  trControl = control_par,
                  metric="Accuracy")

model_boot
## CART 
## 
## 2315 samples
##    3 predictor
##    2 classes: 'Idle', 'Run' 
## 
## No pre-processing
## Resampling: Bootstrapped (1 reps) 
## Summary of sample sizes: 2315 
## Resampling results across tuning parameters:
## 
##   cp         Accuracy   Kappa    
##   0.0449827  0.7604994  0.5212733
##   0.1730104  0.7400681  0.4802199
##   0.2621107  0.6447219  0.2915295
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.0449827.
model_boot$finalModel
## n= 2315 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 2315 1156 Run (0.4993521 0.5006479)  
##   2) X>=-2.041005 1887  792 Idle (0.5802862 0.4197138)  
##     4) X< 1.579366 1399  448 Idle (0.6797713 0.3202287) *
##     5) X>=1.579366 488  144 Run (0.2950820 0.7049180) *
##   3) X< -2.041005 428   61 Run (0.1425234 0.8574766) *

Therefore we take the bootstrap model

## Generate predictions
# NOTE: predict() without `newdata` returns fitted values for the TRAINING
# data, so the figure below is resubstitution (train) accuracy.
y_hats_train <- predict(object=model_boot) 
        
## Print the accuracy
accuracy <- mean(y_hats_train == train_data$Activity)*100
accuracy
## [1] 71.79266

Confusion Matrix train data:

951 idle were correctly classified, 448 wrong

711 run were correctly classified, 205 wrong

# Confusion matrix for the training predictions.
tab1 <- table(Predicted = y_hats_train,
Actual = train_data$Activity)
tab1
##          Actual
## Predicted Idle Run
##      Idle  951 448
##      Run   205 711
sum(diag(tab1))/sum(tab1)
## [1] 0.7179266
# Fix: compare the predictions against the data they were generated from
# (train_data, 2315 rows). The original compared against data_clf (2893
# rows), which silently recycled the shorter vector and emitted warnings.
classified_data <- train_data %>% filter(Activity == y_hats_train)
classified_data
plot(model_boot, main="Model Accuracies with CART")

Extract the variable importance using varImp() to understand which variables came out to be useful.

# Variable importance of the CART model.
varimp_cart <- varImp(model_boot)
plot(varimp_cart, main="Variable Importance with CART")

  • Decision tree for activity
# Basic plot for a decision tree (the final CART model splits on X only)
  plot(model_cv$finalModel,branch = T, margin = 0.1)
  text(model_cv$finalModel)

Accuracy on testing data: 51.74946 %
## Generate predictions on the held-out test set.
## Fix: the original call omitted `newdata`, so it silently re-predicted the
## TRAINING data; the reported "test accuracy" then came from comparing
## vectors of different lengths (hence the recycling warnings).
y_hats_test <- predict(object=model_boot, newdata = test_data) 
        
## Print the accuracy
accuracy <- mean(y_hats_test == test_data$Activity)*100
accuracy
# Rows of the test set that were classified correctly (previously compared
# against the full data_clf, which again recycled mismatched lengths).
classified_data <- test_data %>% filter(Activity == y_hats_test)
classified_data
Further predict data without outliers, to test if the accuracy can be increased:
# Re-read the exported outlier-free data and split it the same way.
read_data <- read.csv("Idle_Run_Tobias_Egger_no_outliers.csv")
motion_data_te_no_outliers <- data.frame(read_data)
set.seed(7)

# Stratified 80/20 split on Activity for the outlier-free data.
train_index <- createDataPartition(motion_data_te_no_outliers$Activity, p = 0.80, list = FALSE)

# 80 % of the rows train and validate the models ...
train_data <- motion_data_te_no_outliers[train_index, ]

# ... and the remaining 20 % are held out for testing.
test_data <- motion_data_te_no_outliers[-train_index, ]

Cross validation achieves 72.28 % for the train data -> Only a little bit better

set.seed(10)

# 10-fold cross-validation on the outlier-free training data.
control_par <- trainControl(method = "cv", number=10)

model_cv_iso <- train(Activity~.,
                      data=train_data, 
                      method="rpart", 
                      trControl = control_par,
                      metric="Accuracy"
                      )

model_cv_iso
## CART 
## 
## 2038 samples
##    3 predictor
##    2 classes: 'Idle', 'Run' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1834, 1834, 1834, 1834, 1835, 1834, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.05677291  0.7228542  0.4479339
##   0.17629482  0.6726779  0.3495759
##   0.24402390  0.5819309  0.1629842
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.05677291.

Bootstrap achieves 75.03 % for the train data -> 1 % accuracy decrease

# train a Classification and Regression Trees (CART) model on the
# outlier-free data, resampled with a single bootstrap repetition
set.seed(34437) #34056

control_par <- trainControl(method = "boot", number=1)

model_boot_iso <- train(Activity~., 
                  data=train_data, 
                  method="rpart",
                  trControl = control_par,
                  metric="Accuracy")

model_boot_iso
## CART 
## 
## 2038 samples
##    3 predictor
##    2 classes: 'Idle', 'Run' 
## 
## No pre-processing
## Resampling: Bootstrapped (1 reps) 
## Summary of sample sizes: 2038 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.05677291  0.7503374  0.5057702
##   0.17629482  0.6396761  0.2970214
##   0.24402390  0.6396761  0.2970214
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.05677291.
Test how good the model is on unlabeled data:
# Classify the unlabeled data set with the bootstrap CART model and export
# the results.
read_data <- read.csv("Idle_Run_Tobias_Egger_no_labels.csv")
unlabeled_data <- data.frame(read_data)

head(unlabeled_data)
# Keep only the Activity placeholder and the three axes.
keep_cols <- !names(unlabeled_data) %in% c("ID", "Creator", "Sample", "timestamp")
unlabeled_data <- unlabeled_data[, keep_cols]
names(unlabeled_data)
## [1] "Activity" "X"        "Y"        "Z"
## Generate predictions
predictions <- predict(object = model_boot, newdata = unlabeled_data)
unlabeled_data$Activity <- predictions
unlabeled_data
write.csv(unlabeled_data, "Idle_Run_Tobias_Egger_Test_Data_Results.csv", row.names = FALSE)

Conclusion:

Outliers don’t always need to be removed

-> e.g. isotree: Threshold could be more optimized to get better results

-> Isotree detected 385 outliers for the train data and 107 outliers for the test data

Final classification model so far: bootstrap -> Achieves 76.04 % on the train data; the reported 51.75 % test accuracy is questionable because the test predictions were generated without `newdata` (note the recycling warnings above).